library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(stringr)
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.5
library(gsubfn)
## Warning: package 'gsubfn' was built under R version 4.0.5
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.0.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.5
Read in the data set
# Load the cleaned injury records and the player demographic table.
# NOTE(review): both paths are relative to the working directory — confirm
# the directory the document is knitted from.
injury <- read.csv("../../Data/all_injuries_clean.csv")
players <- read.csv("../../Data/all_player_demographic_clean.csv")

# Merge data: keep every injury row and attach the demographic columns that
# match on name, team, year and full_team.
injuries <- left_join(
  injury,
  players,
  by = c("name", "team", "year", "full_team")
)

# Quick look at the merged table and its dimensions.
head(injuries)
dim(injuries)
## [1] 17387 34
# Preview columns 11 through 18, the columns reshaped into long format next.
injuries[, 11:18, drop = FALSE]
Gathering into long format
# Reshape columns 11-18 into long format: one row per original record and
# reshaped column, with the column name stored in `bodypart` and its value in
# `counts`. pivot_longer() replaces the superseded gather(); rows come out
# ordered by original record instead of by column, which does not affect the
# grouped sums computed from this table.
injury_gather <- pivot_longer(
  injuries,
  cols = 11:18,
  names_to = "bodypart",
  values_to = "counts"
)
injury_gather
Overall distribution of body part injuries
# Bar chart of total injury counts per body part across all records,
# ordered from the most to the least frequent, with the total printed above
# each bar. (Title typo "Distriubtion" corrected.)
injury_gather %>%
  group_by(bodypart) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, -counts), y = counts)) +
  geom_col() +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25, # nudge the label just above the bar top
    fontface = "bold"
  ) +
  labs(
    x = "Body Part",
    y = "Count",
    title = "Distribution of Injuries by Body Part"
  ) +
  theme_economist() +
  theme(
    axis.title.x = element_text(size = 16, vjust = -3),
    axis.title.y = element_text(size = 16, vjust = 3),
    title = element_text(size = 20),
    plot.title = element_text(hjust = 0.5) # centre the title
  )
For the next part of the EDA, let’s look at the injury distributions across the various positions in football.
# List the distinct position codes present in the merged data.
injuries$position_id %>%
  as.factor() %>%
  levels()
## [1] "" "DEF" "K" "OL" "P" "QB" "RB" "TE" "WR"
# Show the rows recorded for kickers.
filter(injuries, position_id == "K")
We see we have the following positions: Kicker (K), Offensive Line (OL), Punter (P), Quarterback (QB), Running Back (RB), Tight End (TE), Wide Receiver (WR) and Defense (DEF). There is also a blank level ("") for records whose position is empty.
Since Defense has its own category with no specific position (like Linebacker, Defensive Line or Safety), let’s first compare the injury distributions between offensive players and defensive players.
# Position codes treated as offense; "DEF" is handled as its own group.
offensive_position <- c("K", "OL", "P", "QB", "RB", "TE", "WR")

# Only offensive players
offense <- filter(injury_gather, position_id %in% offensive_position)

# Only defensive players
defense <- filter(injury_gather, position_id == "DEF")

# Confirm which position codes ended up in each subset.
offense$position_id %>%
  as.factor() %>%
  levels()
## [1] "K" "OL" "P" "QB" "RB" "TE" "WR"
defense$position_id %>%
  as.factor() %>%
  levels()
## [1] "DEF"

# Compare the sizes of the two subsets (rows, columns).
dim(offense)
## [1] 61768 28
dim(defense)
## [1] 58568 28
We have 61,768 offensive rows and 58,568 defensive rows in the long-format data (7,721 and 7,321 records respectively, each repeated once per body-part column), so the two groups are reasonably balanced and comparing them is reasonable.
Overall distribution of offensive injuries
# Bar chart of total injury counts per body part for the offensive subset,
# ordered from the most to the least frequent, with the total printed above
# each bar. (Title typo "Distriubtion" corrected.)
offense %>%
  group_by(bodypart) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, -counts), y = counts)) +
  geom_col() +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25, # nudge the label just above the bar top
    fontface = "bold"
  ) +
  labs(
    x = "Body Part",
    y = "Count",
    title = "Distribution of Offensive Injuries"
  ) +
  theme_economist() +
  theme(
    axis.title.x = element_text(size = 16, vjust = -3),
    axis.title.y = element_text(size = 16, vjust = 3),
    title = element_text(size = 20),
    plot.title = element_text(hjust = 0.5) # centre the title
  )
Overall distribution of defensive injuries
# Bar chart of total injury counts per body part for the defensive subset,
# ordered from the most to the least frequent, with the total printed above
# each bar. (Title typo "Distriubtion" corrected.)
defense %>%
  group_by(bodypart) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, -counts), y = counts)) +
  geom_col() +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25, # nudge the label just above the bar top
    fontface = "bold"
  ) +
  labs(
    x = "Body Part",
    y = "Count",
    title = "Distribution of Defensive Injuries"
  ) +
  theme_economist() +
  theme(
    axis.title.x = element_text(size = 16, vjust = -3),
    axis.title.y = element_text(size = 16, vjust = 3),
    title = element_text(size = 20),
    plot.title = element_text(hjust = 0.5) # centre the title
  )
Distribution of injuries for kickers
# Bar chart of total injury counts per body part for kickers only
# (position_id == "K"), ordered from the most to the least frequent.
# The title now names kickers (it previously read "Offensive Injuries"), and
# the bars are drawn before the labels, matching the layer order of the other
# body-part charts in this file.
offense %>%
  filter(position_id == "K") %>%
  group_by(bodypart) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, -counts), y = counts)) +
  geom_col() +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25, # nudge the label just above the bar top
    fontface = "bold"
  ) +
  labs(
    x = "Body Part",
    y = "Count",
    title = "Distribution of Kicker Injuries"
  ) +
  theme_economist() +
  theme(
    axis.title.x = element_text(size = 14, vjust = -3),
    axis.title.y = element_text(size = 14, vjust = 3),
    title = element_text(size = 18),
    plot.title = element_text(hjust = 0.5) # centre the title
  )